// FSRCNNX_x2_8-0-4-1
// Ported from https://github.com/igv/FSRCNN-TensorFlow


//!MAGPIE EFFECT
//!VERSION 2
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2


// Source image; pass 1 gathers its red, green and blue channels.
//!TEXTURE
Texture2D INPUT;

// featureMap1 / featureMap2: input-sized RGBA half-float textures written by
// pass 1 and read by pass 2 (four values per pixel in each texture).
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D featureMap1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D featureMap2;

// tex1 / tex2: input-sized RGBA half-float textures written by pass 2 and
// read by pass 3.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex2;

// tex3 / tex4: input-sized RGBA half-float textures written by pass 3.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex3;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex4;


// Point-filtered sampler shared by the gathers in pass 1 and the SampleLevel
// fetches in passes 2 and 3.
//!SAMPLER
//!FILTER POINT
SamplerState sam;


//!PASS 1
//!DESC feature map
//!IN INPUT
//!OUT featureMap1, featureMap2
//!BLOCK_SIZE 32, 24
//!NUM_THREADS 128

// Dimensions of the luma tile staged by pass 1: the block size plus 4 extra
// columns / rows, so that every pixel of the block can read a full 5x5 window.
#define SH_PIXELS_X  (MP_BLOCK_WIDTH + 4)
#define SH_PIXELS_Y  (MP_BLOCK_HEIGHT + 4)

// Group-shared luma tile, indexed as [row][column].
groupshared float shPixelsY[SH_PIXELS_Y][SH_PIXELS_X];

// Returns the luma of an RGB colour as the weighted sum
// 0.299 * R + 0.587 * G + 0.114 * B.
float GetLuma(float3 rgb) {
	const float3 lumaWeights = float3(0.299f, 0.587f, 0.114f);
	return dot(lumaWeights, rgb);
}

// Pass 1 ("feature map"): converts INPUT to luma, convolves each pixel of the
// block with eight 5x5 kernels and writes the eight responses to featureMap1
// (first four) and featureMap2 (last four).
//
// blockStart: top-left pixel of the block processed by this thread group.
// threadId:   index of the thread inside the group; only .x is used.
void Pass1(uint2 blockStart, uint3 threadId) {
	// NOTE(review): GetInputPt / GetInputSize are provided outside this block;
	// they are used here as the size of one input texel in normalised
	// coordinates and the input size in pixels respectively.
	const float2 inputPt = GetInputPt();
	const uint2 inputSize = GetInputSize();

	// Stage 1: the threads cooperatively fill the shared luma tile. Each
	// iteration loads one 2x2 quad with three gathers (one per colour channel).
	// SH_PIXELS_X is even, so a quad never straddles two tile rows.
	for (uint quad = threadId.x * 2; quad < SH_PIXELS_X * SH_PIXELS_Y / 2; quad += MP_NUM_THREADS_X * 2) {
		const uint2 tilePos = uint2(quad % SH_PIXELS_X, quad / SH_PIXELS_X * 2);

		// Tile cell (0, 0) holds input texel blockStart - 2. Gather at the
		// corner shared by the four wanted texels, i.e. one texel past the
		// top-left one. Gathering exactly at a texel centre (the previous
		// "- 1.5f") puts the footprint selection on a rounding boundary, so
		// the 2x2 quad could shift by one texel.
		const float2 tpos = (blockStart + tilePos - 1.0f) * inputPt;

		const float4 sr = INPUT.GatherRed(sam, tpos);
		const float4 sg = INPUT.GatherGreen(sam, tpos);
		const float4 sb = INPUT.GatherBlue(sam, tpos);

		// Gather component layout:
		// w z
		// x y
		shPixelsY[tilePos.y][tilePos.x] = GetLuma(float3(sr.w, sg.w, sb.w));
		shPixelsY[tilePos.y][tilePos.x + 1] = GetLuma(float3(sr.z, sg.z, sb.z));
		shPixelsY[tilePos.y + 1][tilePos.x] = GetLuma(float3(sr.x, sg.x, sb.x));
		shPixelsY[tilePos.y + 1][tilePos.x + 1] = GetLuma(float3(sr.y, sg.y, sb.y));
	}

	// The whole tile must be written before any thread starts reading it.
	GroupMemoryBarrierWithGroupSync();

	// Stage 2: every thread convolves several pixels of the block.
	for (uint pixel = threadId.x; pixel < MP_BLOCK_WIDTH * MP_BLOCK_HEIGHT; pixel += MP_NUM_THREADS_X) {
		const uint2 pos = uint2(pixel % MP_BLOCK_WIDTH, pixel / MP_BLOCK_WIDTH);
		const uint2 destPos = blockStart + pos;

		// Blocks on the right / bottom edge may extend past the image.
		if (destPos.x >= inputSize.x || destPos.y >= inputSize.y) {
			continue;
		}

		// 5x5 luma window, indexed src[column][row]; the weights below walk it
		// column by column.
		float src[5][5];
		[unroll]
		for (uint row = 0; row < 5; ++row) {
			[unroll]
			for (uint col = 0; col < 5; ++col) {
				src[col][row] = shPixelsY[pos.y + row][pos.x + col];
			}
		}

		// Feature channels 0-3: bias followed by the 25 kernel taps.
		float4 target1 = float4(-0.1572492271661758, -0.0120896836742759, 0.0061487639322877, -0.2852848768234253);
		target1 += float4(-0.0047900392673910, 0.0537447109818459, -0.0000247144635068, 0.0066653941757977) * src[0][0];
		target1 += float4(0.0073144687339664, -0.0309004038572311, -0.0109181385487318, -0.0092840325087309) * src[0][1];
		target1 += float4(0.0591700896620750, 0.1974907070398331, -0.0197357516735792, -0.0546554848551750) * src[0][2];
		target1 += float4(-0.0011764382943511, -0.0299451071768999, 0.0229587312787771, 0.0021908886265010) * src[0][3];
		target1 += float4(0.0098101310431957, 0.0080995410680771, -0.0030452020000666, -0.0132035519927740) * src[0][4];
		target1 += float4(-0.0168330334126949, -0.0743711441755295, -0.0259261634200811, 0.0234480481594801) * src[1][0];
		target1 += float4(0.0239933785051107, 0.1896541714668274, 0.0207756329327822, -0.0370332375168800) * src[1][1];
		target1 += float4(0.0094799501821399, -0.0652511194348335, -0.0004292793164495, -0.0726212188601494) * src[1][2];
		target1 += float4(0.0297284796833992, -0.1210186630487442, -0.0202929321676493, -0.0574462898075581) * src[1][3];
		target1 += float4(-0.0318185277283192, 0.0840775370597839, 0.0110451309010386, 0.0415569432079792) * src[1][4];
		target1 += float4(-0.0253141783177853, 0.1168256178498268, 0.1159729585051537, 0.0963164269924164) * src[2][0];
		target1 += float4(-0.1103615835309029, -0.0276833958923817, -0.4999594092369080, 0.1053867191076279) * src[2][1];
		target1 += float4(1.1100435256958008, 0.0646764487028122, 0.0154005717486143, 0.8891586661338806) * src[2][2];
		target1 += float4(0.1229330673813820, 0.1719468832015991, 0.5730338096618652, -0.1645544171333313) * src[2][3];
		target1 += float4(-0.0090442728251219, -0.3023961782455444, -0.1589493155479431, 0.0418574027717113) * src[2][4];
		target1 += float4(0.0031942036002874, -0.1310926079750061, 0.0075543406419456, -0.0016449346439913) * src[3][0];
		target1 += float4(-0.0995150282979012, -0.0701921209692955, -0.0130895879119635, 0.1344170123338699) * src[3][1];
		target1 += float4(0.0060519003309309, -0.1533465683460236, 0.0114194005727768, 0.0264683905988932) * src[3][2];
		target1 += float4(0.0244008023291826, 0.1881769001483917, -0.0206351149827242, -0.0628309547901154) * src[3][3];
		target1 += float4(0.0075713125988841, 0.0508594363927841, 0.0430423170328140, -0.0124188791960478) * src[3][4];
		target1 += float4(-0.0166875869035721, -0.0047865519300103, 0.0006719123339280, 0.0316803231835365) * src[4][0];
		target1 += float4(-0.0058461269363761, 0.0990798473358154, -0.0177743826061487, -0.0066122291609645) * src[4][1];
		target1 += float4(-0.0972401946783066, -0.0225446373224258, -0.0037693574558944, 0.1953062713146210) * src[4][2];
		target1 += float4(-0.0216837190091610, -0.1824268400669098, 0.0069816261529922, 0.0283037684857845) * src[4][3];
		target1 += float4(-0.0025767991319299, 0.0459827110171318, -0.0080216089263558, 0.0084134787321091) * src[4][4];

		// Feature channels 4-7: bias followed by the 25 kernel taps.
		float4 target2 = float4(0.0541447550058365, 0.0088306749239564, -0.0112389577552676, -0.0127860950306058);
		target2 += float4(0.0142660010606050, 0.0137931071221828, 0.0061188107356429, -0.0104134222492576) * src[0][0];
		target2 += float4(0.0147292809560895, -0.0289912857115269, 0.0266769435256720, 0.0933856964111328) * src[0][1];
		target2 += float4(-0.1734338253736496, 0.1116316691040993, -0.1973157376050949, -0.0581855811178684) * src[0][2];
		target2 += float4(0.0347507223486900, -0.0341566652059555, 0.0061667622067034, 0.0075258882716298) * src[0][3];
		target2 += float4(0.0069884369149804, -0.0194250214844942, 0.0080830128863454, -0.0036874092184007) * src[0][4];
		target2 += float4(0.0233764201402664, 0.0344744995236397, 0.0162145942449570, 0.0979529991745949) * src[1][0];
		target2 += float4(0.1280796974897385, -0.1018339172005653, -0.0132977198809385, -0.0019474622095004) * src[1][1];
		target2 += float4(0.4286882579326630, 0.1222677752375603, 0.7046694159507751, 0.0945475697517395) * src[1][2];
		target2 += float4(0.1107441782951355, -0.0134433070197701, -0.0174900908023119, -0.1686445474624634) * src[1][3];
		target2 += float4(0.0321478620171547, 0.0065357843413949, 0.0300805997103453, 0.0420113280415535) * src[1][4];
		target2 += float4(-0.1240341588854790, 0.0950303301215172, -0.0129648456349969, -0.2681856453418732) * src[2][0];
		target2 += float4(0.4846960902214050, 0.0351924635469913, 0.0223043337464333, -0.1273630708456039) * src[2][1];
		target2 += float4(-1.9379507303237915, -0.2444442063570023, 0.0291962660849094, -0.3835578560829163) * src[2][2];
		target2 += float4(0.6396278142929077, -0.0765938311815262, -0.0552659817039967, 0.4393545985221863) * src[2][3];
		target2 += float4(-0.1969728022813797, -0.0607173256576061, 0.0131113547831774, 0.0542017817497253) * src[2][4];
		target2 += float4(0.0091696009039879, -0.0031533432193100, -0.0368777588009834, -0.0459998287260532) * src[3][0];
		target2 += float4(0.1096992492675781, 0.2597902715206146, 0.0304869692772627, -0.0195200722664595) * src[3][1];
		target2 += float4(0.2889648377895355, -0.4275591969490051, -0.7414156794548035, 0.2695442438125610) * src[3][2];
		target2 += float4(0.0892018377780914, -0.0229137558490038, 0.0244414471089840, -0.1926898956298828) * src[3][3];
		target2 += float4(0.0576358586549759, 0.0027846973389387, -0.0036861505359411, -0.0253547113388777) * src[3][4];
		target2 += float4(0.0159624069929123, 0.0319602824747562, 0.0019470085389912, 0.0089780492708087) * src[4][0];
		target2 += float4(0.0552792511880398, 0.0543054342269897, 0.0134062822908163, 0.0545728243887424) * src[4][1];
		target2 += float4(-0.1170092225074768, 0.1963327825069427, 0.1503890156745911, 0.1891828328371048) * src[4][2];
		target2 += float4(-0.0084421783685684, 0.1297017931938171, -0.0330600887537003, -0.0942063704133034) * src[4][3];
		target2 += float4(0.0118440408259630, -0.0337875857949257, 0.0055063469335437, 0.0254479162395000) * src[4][4];

		featureMap1[destPos] = target1;
		featureMap2[destPos] = target2;
	}
}


//!PASS 2
//!DESC mapping 1
//!IN featureMap1, featureMap2
//!OUT tex1, tex2
//!BLOCK_SIZE 8
//!NUM_THREADS 64

// Pass 2 ("mapping 1"): for one pixel, reads the 3x3 neighbourhood of
// featureMap1 and featureMap2, multiplies each of the 18 samples by its own
// 4x4 weight matrix, adds the products to a bias, applies a per-channel leaky
// activation and writes the two results to tex1 and tex2.
//
// blockStart: top-left pixel of the block processed by this thread group.
// threadId:   index of the thread inside the group; only .x is used.
void Pass2(uint2 blockStart, uint3 threadId) {
	// NOTE(review): Rmp8x8 is defined outside this block; presumably it maps
	// the flat thread index to a 2D position inside the 8x8 block - confirm.
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	// Threads that fall outside the image have nothing to write.
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	// Coordinate of the centre of pixel gxy, scaled by inputPt for sampling.
	float2 pos = (gxy + 0.5f) * inputPt;

	// [tl, tc, tr]
	// [ml, mc, mr]
	// [bl, bc, br]
	// 3x3 neighbourhood of featureMap1, fetched one column at a time.
	float4 tl1 = featureMap1.SampleLevel(sam, pos - inputPt, 0);
	float4 ml1 = featureMap1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl1 = featureMap1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc1 = featureMap1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc1 = featureMap1.SampleLevel(sam, pos, 0);
	float4 bc1 = featureMap1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr1 = featureMap1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr1 = featureMap1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br1 = featureMap1.SampleLevel(sam, pos + inputPt, 0);

	// Same 3x3 neighbourhood of featureMap2.
	float4 tl2 = featureMap2.SampleLevel(sam, pos - inputPt, 0);
	float4 ml2 = featureMap2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl2 = featureMap2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc2 = featureMap2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc2 = featureMap2.SampleLevel(sam, pos, 0);
	float4 bc2 = featureMap2.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr2 = featureMap2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr2 = featureMap2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br2 = featureMap2.SampleLevel(sam, pos + inputPt, 0);

	// First output (tex1): bias, then one 4x4 matrix per neighbour sample.
	float4 target1 = float4(-0.0445119962096214, -0.7632357478141785, 0.0156328510493040, -0.2424548566341400);
	target1 += mul(tl1, float4x4(0.1279004216194153, -0.0275541823357344, 0.2275633513927460, 0.2241709381341934, 0.0197204202413559, -0.0456816256046295, -0.1296672523021698, 0.0564568229019642, -0.0241488646715879, -0.0237508192658424, -0.1899632662534714, 0.4177669584751129, -0.1814560592174530, -0.0526473335921764, 0.1154382973909378, -0.0715614855289459));
	target1 += mul(tl2, float4x4(-0.0660311505198479, 0.0416736751794815, 0.3146112561225891, 0.1472041457891464, -0.3456672728061676, -0.0055983816273510, 0.0022350433282554, 0.0819796621799469, 0.0057485047727823, 0.1532524228096008, 0.0204557459801435, -0.2500547170639038, -0.0524359568953514, -0.1911625266075134, -0.1078366711735725, -0.1296254843473434));
	target1 += mul(ml1, float4x4(0.0904538556933403, -0.0150672039017081, 0.3322310745716095, 0.0638923197984695, 0.5975797176361084, -0.2452044337987900, -0.4947478473186493, -0.0783191770315170, 0.5771877169609070, -0.0870653912425041, -0.8966570496559143, -0.2140965163707733, -0.0493861362338066, -0.0380848757922649, -0.1345319598913193, -0.0186063013970852));
	target1 += mul(ml2, float4x4(-0.2523841261863708, 0.1387074738740921, 0.7878478765487671, -0.2251627445220947, 0.2277439534664154, 0.5417668819427490, 0.0866540968418121, -0.1707777529954910, -0.0598246827721596, -0.4717158675193787, -1.2242834568023682, 0.0454643070697784, -0.3503442704677582, 0.0573085807263851, 0.2530198395252228, -0.0207283068448305));
	target1 += mul(bl1, float4x4(0.0168380383402109, -0.2142438590526581, -0.0207892972975969, 0.3628533780574799, 0.2431225180625916, 0.3098322153091431, 0.4073205888271332, -0.2762102782726288, -0.0197229012846947, 0.1305596232414246, -0.5697882771492004, -0.2976251542568207, -0.0551432967185974, 0.2614036500453949, -0.1410341411828995, -0.2906406223773956));
	target1 += mul(bl2, float4x4(-0.0498303361237049, 0.0224859956651926, 0.1952174901962280, -0.0311204437166452, 0.2501715123653412, -0.5893352627754211, -1.0793941020965576, 0.0160885509103537, 0.5081620812416077, 0.0482814386487007, 0.0546359121799469, -0.0501569248735905, 0.1400523334741592, -0.0106841633096337, -0.0940591320395470, -0.1791856139898300));
	target1 += mul(tc1, float4x4(0.0393299944698811, 0.2232691347599030, -0.1055066883563995, -0.1607919186353683, -0.1567825973033905, -0.0042221010662615, -0.0548228211700916, 0.2352052628993988, 0.1483389288187027, 0.7503526806831360, 0.0797731876373291, -0.0049001369625330, -0.0242983382195234, -0.0308702979236841, 0.0828925222158432, 0.0561857633292675));
	target1 += mul(tc2, float4x4(0.0926392748951912, -0.0418718457221985, -0.3060409128665924, -0.1883587390184402, 0.0284292586147785, -0.3584854304790497, -0.7909982800483704, -0.0187337957322598, -0.2496993243694305, -0.7520986795425415, 0.3771523833274841, -0.0259053874760866, 0.0337998159229755, 0.2209153026342392, 0.0708771497011185, -0.2814430892467499));
	target1 += mul(mc1, float4x4(-0.5287809371948242, 0.5777525901794434, 0.0880500450730324, -0.8452472090721130, -0.3393408954143524, -0.2273543328046799, -0.1298527419567108, 0.4990308582782745, 1.2613251209259033, -0.7636719942092896, 1.5694186687469482, -0.4087363779544830, 0.0874531939625740, 0.7067158818244934, -0.3419588804244995, -0.3265531957149506));
	target1 += mul(mc2, float4x4(0.8229957222938538, -0.1236215904355049, -0.1859253048896790, 1.6684840917587280, 0.2000777721405029, -0.1239093989133835, 1.5623438358306885, 0.1779983490705490, 0.1017884835600853, -0.3707404434680939, 1.0626678466796875, -0.3124029338359833, 0.0659058541059494, -0.3585464656352997, -0.1866402775049210, 0.6733445525169373));
	target1 += mul(bc1, float4x4(-0.5544115900993347, -0.1892931908369064, 0.2460739761590958, -0.1056193932890892, -0.4318082630634308, 0.1257930994033813, -0.2672747671604156, -0.1690235435962677, 0.0018221997888759, -0.4397548139095306, -0.3007801771163940, 0.1068472340703011, 0.3506655991077423, 0.1143834441900253, 0.1363849341869354, -0.1417382210493088));
	target1 += mul(bc2, float4x4(-0.0505668744444847, 0.1831464916467667, 0.3957343697547913, -0.2295413911342621, -0.3892803490161896, 0.5436951518058777, 0.1217770799994469, 0.0223295800387859, -0.4462866187095642, -0.4055982232093811, -0.3771279454231262, 0.0807068347930908, 0.2116729617118835, 0.0281026475131512, -0.0229265503585339, 0.2868605256080627));
	target1 += mul(tr1, float4x4(0.1962712109088898, -0.2373334914445877, -2.5208437442779541, -0.1988540291786194, 0.2224564403295517, -0.1783192902803421, -0.3962321281433105, -0.1685980409383774, 0.1910390257835388, 0.2554391324520111, 0.4586416482925415, 0.2779130041599274, -0.2002453953027725, -0.0061091855168343, 1.3808131217956543, 0.0434907525777817));
	target1 += mul(tr2, float4x4(-0.0307611189782619, -0.0524470545351505, -0.5897512435913086, -0.0816674903035164, 0.4052906930446625, 0.2542210817337036, -1.9041002988815308, 0.0835462361574173, -0.2484460622072220, -0.0184739269316196, 0.4510098397731781, 0.2587619423866272, 0.1537084281444550, 0.1503131389617920, -0.0742949545383453, 0.0613216012716293));
	target1 += mul(mr1, float4x4(0.1772638261318207, 0.0948876664042473, 0.0083848545327783, -0.2919732332229614, 0.2566950321197510, 0.0288751143962145, -0.4624863862991333, -0.0608786940574646, 0.3310996592044830, -0.0104284398257732, 0.6334818005561829, -0.0027201652992517, -0.0342350602149963, 0.1938806027173996, -0.2464301586151123, 0.0125883584842086));
	target1 += mul(mr2, float4x4(0.4839433431625366, -0.0502159744501114, -1.1114163398742676, -0.3965759575366974, 0.2117286175489426, 0.0414481423795223, -0.1332397013902664, -0.0549883767962456, -0.1275007277727127, 0.7844302654266357, -0.0095163453370333, 0.0961041301488876, -0.4759134948253632, -0.4284025132656097, -0.2072399407625198, -0.3953579664230347));
	target1 += mul(br1, float4x4(0.1605869531631470, -0.1715892106294632, 0.0865620598196983, -0.0464400537312031, -0.2688548862934113, 0.1722514480352402, 0.0167612321674824, -0.0032994034700096, -0.3451044559478760, -0.2280300110578537, -0.0029796555172652, -0.1597652435302734, 0.0500137843191624, 0.1023071259260178, -0.0407028235495090, 0.2228624969720840));
	target1 += mul(br2, float4x4(0.6999920010566711, 0.0839441940188408, 0.0815469548106194, -0.1509176045656204, -0.0690853074193001, -0.3200871348381042, 0.0780162736773491, -0.1449639797210693, 0.2868815064430237, 0.3962450027465820, -0.3439113497734070, 0.2657423913478851, 0.0988137871026993, 0.3471299111843109, -0.2186402678489685, -0.0648017078638077));
	// Activation: positive values pass through, negative values are scaled by
	// a per-channel slope.
	target1 = max(target1, 0) + float4(1.0311057567596436, 0.1051208898425102, 0.1158760935068130, 0.0466635078191757) * min(target1, 0);

	// Second output (tex2): same structure with its own bias, matrices and slopes.
	float4 target2 = float4(0.0713458731770515,-0.1403961777687073,-0.0019562745001167,0.0153338573873043);
	target2 += mul(tl1, float4x4(-0.0950641855597496, -0.1496641039848328, -0.0653550028800964, 0.0655386000871658, -0.0118882004171610, 0.2012491524219513, -0.2844599783420563, -0.4794720113277435, 0.1128025799989700, -0.0173030979931355, -0.0558849945664406, -0.2957552075386047, 0.0128202112391591, 0.0199047476053238, -0.0091027505695820, -0.0789640173316002));
	target2 += mul(tl2, float4x4(0.1597457975149155, -0.0476507246494293, 0.1466529071331024, 0.0859163030982018, 0.0797316282987595, -0.3380981683731079, 0.2370245009660721, -0.1145931258797646, -0.0352988094091415, -0.0444888733327389, -0.2100716233253479, 0.1305520236492157, -0.1359029710292816, 0.1097442805767059, 0.0449938289821148, -0.1155664771795273));
	target2 += mul(ml1, float4x4(-0.0333916284143925, 0.2415594160556793, 0.0520512908697128, 0.1228107511997223, -0.0491011217236519, 0.4408806562423706, 0.4631956815719604, 0.2014560103416443, -0.3688595592975616, 0.0367180295288563, 0.2484581321477890, -0.1113442853093147, 0.1283355057239532, 0.0418004281818867, -0.0171243026852608, -0.1231943219900131));
	target2 += mul(ml2, float4x4(0.3493446409702301, 0.4550022482872009, 0.0368724688887596, 0.0748724937438965, 0.5001406073570251, 0.0145555436611176, 0.1236629858613014, 0.3143120706081390, -0.1951988488435745, -0.0157914645969868, 0.0937998965382576, -0.2233840376138687, 0.5033411383628845, -0.3183194100856781, -0.2259195148944855, 0.3639536798000336));
	target2 += mul(bl1, float4x4(-0.0742707476019859, -0.1287801116704941, -0.2533137500286102, 0.0666435658931732, -0.0185621567070484, 0.1427449285984039, -0.0724751204252243, -0.0781485065817833, -0.2270648330450058, -0.2314778864383698, 0.3814929425716400, -0.1655400246381760, 0.0408568829298019, -0.1139645278453827, 0.1797397136688232, -0.0245632305741310));
	target2 += mul(bl2, float4x4(0.1184135973453522, 0.0439366139471531, 0.0225226897746325, -0.0038526873104274, 0.1292685419321060, 0.0629177838563919, 0.3455114960670471, -0.1857204884290695, -0.4921502172946930, -0.1171003505587578, 0.0188624169677496, -0.1101682260632515, 0.0676844567060471, 0.5154085755348206, -0.0898379907011986, 0.3413280248641968));
	target2 += mul(tc1, float4x4(-0.2631838321685791, 0.0215514600276947, 0.3092688918113708, -0.0200904365628958, 0.0678770467638969, 0.1769931465387344, -0.3653681278228760, -0.3274513185024261, 0.4608019888401031, -0.1544784456491470, 0.1189439669251442, 0.7015876173973083, 0.2732816934585571, -0.0545057803392410, -0.3474545478820801, -0.0253226496279240));
	target2 += mul(tc2, float4x4(0.0994316861033440, 0.0642566010355949, 0.2031503319740295, 0.2276959568262100, -0.1094077304005623, 0.4463521838188171, 0.0921792611479759, -0.3033096492290497, -0.0953373983502388, -0.1331395804882050, 0.2615413069725037, -0.2874414622783661, -0.0389687754213810, 0.0338272154331207, 0.2804331183433533, -0.3443813025951385));
	target2 += mul(mc1, float4x4(-0.1806042939424515, -0.4840798676013947, 0.4222546219825745, 0.1238701492547989, 0.0117481639608741, -0.5986865758895874, 0.3057619929313660, 0.1934896260499954, -0.7086342573165894, -0.8567376136779785, 0.6944998502731323, -1.4599204063415527, 0.0886754393577576, -0.4293498098850250, -0.1524195969104767, 0.2418079674243927));
	target2 += mul(mc2, float4x4(2.1706113815307617, 0.3525652289390564, -0.7008359432220459, -0.4825965166091919, -0.3203429281711578, 0.8500943183898926, -0.7993509769439697, 0.4329842329025269, 0.2106771767139435, 1.1103280782699585, 1.2092385292053223, 1.4814503192901611, -0.4147390127182007, -0.7046836614608765, -0.1443170011043549, -0.6811133027076721));
	target2 += mul(bc1, float4x4(-0.1489356607198715, 0.1400019824504852, 0.2425604313611984, -0.2098473459482193, -0.1580564379692078, 0.1463224738836288, -0.2187854647636414, 0.5174596905708313, -0.0143817225471139, -0.0362622961401939, -0.0068237944506109, 0.4749472737312317, 0.2914732992649078, -0.3306328952312469, -0.2444777786731720, -0.1171946674585342));
	target2 += mul(bc2, float4x4(0.0455239675939083, 0.3496046066284180, 0.1297491937875748, -0.2541095912456512, 0.3605501055717468, 0.2339573651552200, -0.0188565086573362, -0.0526181310415268, 0.1471424549818039, 0.8212822079658508, 0.0819099843502045, -0.0851665437221527, 0.3739568293094635, 0.1304695755243301, 0.1481167376041412, -0.2134698331356049));
	target2 += mul(tr1, float4x4(-0.2076720446348190, -0.0932599306106567, 0.0648527294397354, -0.2374770641326904, -0.0927826911211014, 0.1848200261592865, 0.4131188094615936, 0.3280069231987000, -0.2099185734987259, 0.2130926996469498, -0.0362745784223080, 0.0191331822425127, 0.1590368449687958, 0.0303016249090433, 0.1207325309514999, 0.2451425045728683));
	target2 += mul(tr2, float4x4(-0.0135009605437517, -0.0101303057745099, 0.0752487555146217, 0.0533373840153217, -0.0253537259995937, 0.1318614929914474, -0.1263181120157242, 0.0249524712562561, -0.1477261483669281, 0.3236559033393860, 0.0773291289806366, -0.1439673304557800, -0.2005890905857086, 0.0892757251858711, 0.0398719944059849, 0.3675192892551422));
	target2 += mul(mr1, float4x4(-0.0193535499274731, -0.2256918102502823, 0.0341436080634594, 0.0795947611331940, 0.1496857404708862, -0.2784725725650787, -0.0582313314080238, -0.2786065340042114, -0.1666128039360046, -0.6534121036529541, 0.2695854306221008, -0.0179719906300306, 0.0015976354479790, 0.0139929885044694, -0.1706486046314240, -0.3274765610694885));
	target2 += mul(mr2, float4x4(-0.7170836329460144, 0.0868831276893616, 0.1829078495502472, -0.0076045366004109, 0.1525912433862686, -0.2558896839618683, 0.0893209800124168, -0.3426039516925812, -0.2871107757091522, -0.2445062994956970, 0.1676304638385773, 0.2116415053606033, 0.0883995518088341, -0.3880331516265869, 0.2636835277080536, -0.2514505982398987));
	target2 += mul(br1, float4x4(-0.1861270815134048, 0.2000686377286911, -0.1501186788082123, 0.1525203883647919, 0.1969228833913803, 0.1174068301916122, -0.1281060427427292, -0.0854888409376144, 0.0290613435208797, -0.0538076497614384, -0.0251582786440849, 0.0692845508456230, 0.0384319014847279, 0.2888138592243195, 0.1151804402470589, 0.0990421250462532));
	target2 += mul(br2, float4x4(-0.0344385802745819, 0.1270371377468109, 0.0922426953911781, -0.0426749102771282, -0.1656492203474045, -0.3273328542709351, -0.0282224025577307, 0.1099396124482155, -0.1113230437040329, 0.2943290174007416, -0.2181112915277481, -0.3177657723426819, -0.1096536740660667, -0.0508293099701405, -0.0256164856255054, -0.0388228967785835));
	target2 = max(target2, 0) + float4(0.7142407894134521, 0.0686190053820610, 0.3999933302402496, -1.0247212648391724) * min(target2, 0);

	// Store both results at this pixel for pass 3.
	tex1[gxy] = target1;
	tex2[gxy] = target2;
}


//!PASS 3
//!DESC mapping 2
//!IN tex1, tex2
//!OUT tex3, tex4
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass3(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [tl, tc, tr]
	// [ml, mc, mr]
	// [bl, bc, br]
	float4 tl1 = tex1.SampleLevel(sam, pos - inputPt, 0);
	float4 ml1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc1 = tex1.SampleLevel(sam, pos, 0);
	float4 bc1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br1 = tex1.SampleLevel(sam, pos + inputPt, 0);

	float4 tl2 = tex2.SampleLevel(sam, pos - inputPt, 0);
	float4 ml2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc2 = tex2.SampleLevel(sam, pos, 0);
	float4 bc2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br2 = tex2.SampleLevel(sam, pos + inputPt, 0);

	float4 target1 = float4(0.0203563515096903,0.1902436912059784,-0.0757935121655464,0.0393617525696754);
	target1 += mul(tl1, float4x4(-0.1080558672547340, -0.0400269515812397, 0.1042881682515144, -0.1994346678256989, 0.0172465778887272, -0.0829331055283546, -0.1278677284717560, -0.0762506872415543, -0.0593080408871174, -0.0305212251842022, 0.1326192617416382, -0.3380933105945587, -0.0722763314843178, -0.1975518912076950, -0.0223602931946516, 0.2251029163599014));
	target1 += mul(tl2, float4x4(0.1747678220272064, 0.0297168865799904, 0.1054855734109879, 0.0803295820951462, -0.0338115766644478, -0.3885377943515778, -0.3540246784687042, -0.0719623491168022, -0.0656022280454636, -0.0469004511833191, 0.1379419565200806, 0.0319863893091679, 0.0799935683608055, -0.0099127553403378, 0.1698455959558487, -0.0108015276491642));
	target1 += mul(ml1, float4x4(0.1587898135185242, 0.3995443880558014, -0.0333226583898067, 0.2373267263174057, -0.1616930961608887, 0.0659186244010925, 0.0141129801049829, -0.0541022196412086, -0.5743742585182190, 0.1121487766504288, 0.4259817600250244, 0.0280795227736235, -0.3721714317798615, -0.3496374189853668, 0.0997273251414299, -0.0079920450225472));
	target1 += mul(ml2, float4x4(0.0928084030747414, 0.3107658624649048, 0.1375299990177155, 0.1550617516040802, -0.0780353918671608, -0.0102957757189870, -0.2056752145290375, -0.3927979469299316, -1.2112152576446533, 0.0213295854628086, 0.1396545022726059, 0.0492016039788723, -0.0569122135639191, -0.1691886335611343, -0.1535325646400452, 0.2800904810428619));
	target1 += mul(bl1, float4x4(0.2494744062423706, -0.0363066755235195, 0.0959179550409317, -0.0048101749271154, -0.0195793900638819, 0.0451166369020939, 0.1470773071050644, -0.0050059854984283, 0.2886958122253418, -0.3221147954463959, -0.7062104344367981, 0.1646659970283508, -0.0092520527541637, -0.1254461258649826, 0.0217506736516953, -0.0678806379437447));
	target1 += mul(bl2, float4x4(-0.0686557441949844, -0.0414490625262260, -0.1855954080820084, 0.0264346338808537, -0.0296857114881277, -0.0431593284010887, 0.0669397041201591, -0.0946076661348343, -0.2036914378404617, -0.1336101740598679, -0.2099903970956802, -0.1327936947345734, -0.1002155169844627, -0.0368575826287270, -0.1660962998867035, 0.0728288888931274));
	target1 += mul(tc1, float4x4(0.5504320859909058, 0.2939232587814331, 0.4704743027687073, 0.2129514217376709, 0.0843106731772423, -0.1978624463081360, -0.3298224806785583, 0.1919094175100327, 0.1980742365121841, -0.0644423812627792, 0.0091170109808445, -0.2124856859445572, 0.0804558470845222, -0.1130188927054405, -0.6276652812957764, 0.1861163526773453));
	target1 += mul(tc2, float4x4(-0.3357668519020081, 0.2093413323163986, 0.4355416595935822, 0.1550502777099609, -0.6510964035987854, -0.1751857399940491, -0.2060168534517288, -0.1710205078125000, -0.1202360317111015, -0.2500316798686981, 0.1074745431542397, -0.2418434321880341, 0.0133954072371125, -0.0555886104702950, 0.1514673978090286, 0.2739115655422211));
	target1 += mul(mc1, float4x4(-0.3006273508071899, -0.2699472010135651, -0.1982013583183289, -0.0032952548936009, 0.0307833012193441, 0.3671586215496063, -0.0966020002961159, -0.2836556434631348, 0.4297264218330383, 0.6171903610229492, 0.6723483800888062, 0.2705117464065552, -0.1438141316175461, -0.0873940736055374, -0.7001031041145325, -0.2052250355482101));
	target1 += mul(mc2, float4x4(-0.2875024676322937, -1.6230558156967163, -0.6733398437500000, -0.9642448425292969, -0.1964960694313049, 0.2485812455415726, 0.1236900389194489, -1.1423941850662231, -0.0412602946162224, 0.3412002623081207, 0.3962794244289398, -0.2490761876106262, -0.0058065578341484, -0.4578708708286285, -0.2418260127305984, 0.5357795953750610));
	target1 += mul(bc1, float4x4(0.0062361713498831, 0.1925230026245117, 0.0824977159500122, 0.0561275146901608, 0.0929671525955200, 0.0698546022176743, 0.3816939592361450, 0.0395248420536518, -0.0719512030482292, 0.0564917400479317, -0.1297784000635147, 0.1245511695742607, 0.0012355837970972, -0.0990515723824501, 0.4213519692420959, -0.1645816713571548));
	target1 += mul(bc2, float4x4(-0.0611936338245869, -0.0220258161425591, -0.0040935277938843, -0.1060328409075737, -0.0583154149353504, -0.0171997752040625, 0.1058546081185341, 0.2793170809745789, -0.2339317053556442, -0.1972009539604187, -0.0600687190890312, -0.0684379041194916, 0.0243016034364700, -0.2111079394817352, -0.2042971849441528, 0.0724857896566391));
	target1 += mul(tr1, float4x4(-0.0833447948098183, -0.0533220991492271, 0.0767802372574806, 0.1182348504662514, -0.0223299078643322, -0.0479344800114632, -0.0119727496057749, 0.0524821877479553, -0.0334780365228653, 0.0719002187252045, 0.0439689308404922, 0.0475181229412556, 0.0764308497309685, 0.0086713796481490, -0.1700707823038101, 0.06573542952537547));
	target1 += mul(tr2, float4x4(0.1391696482896805, 0.0739523395895958, 0.0565792545676231, -0.0430364646017551, 0.0943084582686424, 0.0102064209058881, 0.0120795257389545, -0.0841303989291191, 0.1573246121406555, 0.0164279472082853, 0.0988841354846954, -0.1430613398551941, -0.0572808869183064, -0.0844292491674423, 0.0621565617620945, 0.0923799052834511));
	target1 += mul(mr1, float4x4(-0.1223107874393463, -0.2441930323839188, -0.2410650849342346, -0.0162935722619295, 0.0695567727088928, -0.0028583710081875, -0.0059417244046926, 0.0715164169669151, -0.0668491795659065, -0.1499572396278381, 0.0869924053549767, 0.0553652904927731, 0.2729566097259521, 0.1370039582252502, -0.1282183527946472, -0.1451860070228577));
	target1 += mul(mr2, float4x4(0.1331952214241028, 0.0021079662255943, -0.1116734445095062, -0.4168601930141449, 0.0534659475088120, 0.0037860786542296, -0.0366065911948681, 0.1047701835632324, 0.1491260826587677, 0.0782341659069061, 0.0949895009398460, -0.1160908639431000, -0.1057133302092552, -0.2699718773365021, -0.1193305626511574, 0.2142304331064224));
	target1 += mul(br1, float4x4(0.0041565205901861, -0.1065499857068062, -0.0629659667611122, -0.1144768893718719, 0.0318886637687683, -0.0562519319355488, 0.0043422472663224, 0.0226082988083363, -0.1456198990345001, -0.2398656159639359, -0.2625046670436859, -0.0710547044873238, 0.0067904205061495, 0.0018544088816270, 0.1019348874688148, -0.0186133962124586));
	target1 += mul(br2, float4x4(0.0732532218098640, 0.1516859829425812, 0.0580205544829369, 0.1968977004289627, -0.0066619524732232, -0.1597842127084732, -0.0990600511431694, -0.1059188917279243, 0.0718481168150902, -0.2222738713026047, -0.1675696671009064, -0.1500017195940018, -0.0568779110908508, -0.0582777932286263, -0.0844587534666061, -0.0263266414403915));
	target1 = max(target1, 0) + float4(-0.2459529191255569, 0.7563464641571045, -0.0705636814236641, -0.0094820559024811) * min(target1, 0);

	float4 target2 = float4(-0.0448397286236286,-0.1649267971515656,-0.1192543581128120,-0.0061073559336364);
	target2 += mul(tl1, float4x4(0.0724840760231018, -0.0480341166257858, -0.1082391515374184, -0.1447021961212158, 0.0723197236657143, 0.0481830574572086, 0.0009448126656935, 0.0353565886616707, -0.0653375908732414, 0.0029647622723132, -0.0016588598955423, -0.2075651884078979, 0.0403469167649746, 0.3929971158504486, 0.0342363268136978, 0.1427230089902878));
	target2 += mul(tl2, float4x4(-0.0743464827537537, 0.1844420731067657, 0.0256296340376139, -0.2808582782745361, 0.0351609662175179, 0.3277008235454559, -0.0205841138958931, -0.5355809330940247, 0.0681906566023827, 0.2058052271604538, -0.0479847639799118, -0.3735262751579285, -0.0261550359427929, -0.1148884072899818, -0.2329017966985703, 0.0728458985686302));
	target2 += mul(ml1, float4x4(-0.1236097738146782, 0.1251334398984909, -0.1339431256055832, 0.0198749266564846, -0.1325920224189758, -2.2431972026824951, -0.0680834427475929, -0.5671764612197876, -0.3431925177574158, -0.0983135104179382, -0.2207138091325760, -0.2374879121780396, 0.0127309206873178, 1.3076044321060181, 0.0848151743412018, -0.1928595900535583));
	target2 += mul(ml2, float4x4(-0.0471093133091927, -0.1513628512620926, -0.0134263765066862, -0.1519252359867096, -0.5260242223739624, 0.2291621714830399, 0.4088975787162781, -0.4315340518951416, 0.0933236032724380, -1.0386694669723511, 0.0015958193689585, -0.2737887501716614, -0.0246253963559866, -0.2722961604595184, -0.1770633459091187, -0.2291279733181000));
	target2 += mul(bl1, float4x4(-0.0017552347853780, 0.1903935521841049, -0.0740704238414764, -0.0917679518461227, 0.0323882810771465, -0.3029108047485352, 0.0532565414905548, -0.0651542618870735, 0.4868686199188232, 0.8539272546768188, 0.4151960313320160, 0.2619662582874298, -0.0413270294666290, 0.1404227763414383, 0.1027320474386215, 0.3274228572845459));
	target2 += mul(bl2, float4x4(0.1828346252441406, 0.0274682324379683, -0.1169882863759995, 0.0327291004359722, 0.1786244213581085, -0.6569546461105347, -0.0609031207859516, -0.1676601022481918, -0.1481092721223831, 0.2889067530632019, 0.1246089115738869, 0.2203597426414490, -0.0366856977343559, 0.1539470851421356, 0.0069492300972342, -0.1544002443552017));
	target2 += mul(tc1, float4x4(0.2073992937803268, -0.0717074573040009, -0.0196173377335072, -0.0956910699605942, 0.0728898122906685, 0.0484567955136299, 0.3063069283962250, -0.3200540542602539, 0.0291527546942234, -0.0265460256487131, 0.1168476045131683, -0.2479970753192902, 0.1224220171570778, 0.0745823010802269, 0.1868897676467896, -0.1958049237728119));
	target2 += mul(tc2, float4x4(0.0019954447634518, -0.0225235987454653, 0.0812198966741562, 0.0295672398060560, -0.2016931176185608, -0.2239151000976562, -0.2481262385845184, -0.2381946444511414, -0.0520484372973442, -0.1200495883822441, 0.2121954560279846, -0.1573531329631805, -0.0198472067713737, 0.1001087054610252, -0.1084884032607079, -0.3126969039440155));
	target2 += mul(mc1, float4x4(0.3838330209255219, 0.1678779572248459, 0.6496244072914124, 0.3783606290817261, -0.2198582738637924, -0.2351343184709549, -0.2852248847484589, 0.6310021877288818, 0.8083020448684692, 0.0039323624223471, -0.0901831910014153, 0.0797894075512886, -0.2271467447280884, 0.7082978487014771, 0.1513756662607193, 0.2188975960016251));
	target2 += mul(mc2, float4x4(-0.2871031761169434, 0.2316448241472244, 0.4947948157787323, 0.3308620452880859, -0.0623455122113228, -0.1314185708761215, -0.2664661705493927, 0.8725078701972961, 0.4541083276271820, 0.1433589160442352, -1.1269453763961792, 0.6427971124649048, -0.1016561388969421, 0.3418317139148712, -0.0991155728697777, -1.0508837699890137));
	target2 += mul(bc1, float4x4(-0.2179604172706604, 0.1258949041366577, -0.1155700981616974, -0.0536149404942989, -0.0140614463016391, -0.0091438721865416, -0.0501774959266186, -0.3570724725723267, -0.5832386016845703, 0.2004123181104660, 0.2986239194869995, -0.8139168024063110, 0.0142666567116976, 0.0681498944759369, 0.1293468028306961, -0.1001938357949257));
	target2 += mul(bc2, float4x4(0.1952836811542511, -0.3092494010925293, 0.3063779771327972, 0.1934849917888641, 0.0746696740388870, -0.3533902466297150, -0.1269576102495193, -0.2237875163555145, 0.2470717132091522, -0.2640363574028015, -0.2862776815891266, 0.1740108281373978, -0.0963631942868233, 0.2631850540637970, 0.0400718413293362, -0.3590607047080994));
	target2 += mul(tr1, float4x4(-0.5299927592277527, 0.0979989692568779, 0.1666737496852875, -0.1547524333000183, -0.0043443185277283, 0.1540203243494034, 0.0594348423182964, -0.0167275425046682, -0.1043610796332359, 0.0504250898957253, 0.0456700921058655, 0.2525034546852112, 0.2241353541612625, -0.1678503304719925, 0.1532667279243469, 0.2901742458343506));
	target2 += mul(tr2, float4x4(0.0998796448111534, 0.0385462641716003, -0.0762400180101395, -0.1255892217159271, 0.0281430184841156, -0.0304958485066891, -0.1440480053424835, -0.1001605167984962, -0.2257689833641052, 0.2056092917919159, 0.0248535349965096, -0.1383949518203735, -0.0951708629727364, 0.0997417271137238, 0.0275330394506454, -0.5728432536125183));
	target2 += mul(mr1, float4x4(0.4256163835525513, 0.1745115518569946, -0.2409395426511765, 0.3139856457710266, -0.0036795330233872, 0.1819283962249756, -0.0864531323313713, 0.0102691333740950, -0.3397279977798462, 0.1107075437903404, -0.0035228815395385, -0.2207705229520798, -0.1779139339923859, -0.2106117755174637, 0.0352664291858673, 0.3615589439868927));
	target2 += mul(mr2, float4x4(-0.0345224253833294, -0.0669926702976227, 0.0907212942838669, -0.3758732676506042, -0.0452554710209370, -0.1134464666247368, -0.0358871109783649, -0.1858227252960205, -0.0233245138078928, -0.0495684742927551, 0.1976234614849091, -0.1165761798620224, -0.0340447537600994, 0.1095624342560768, 0.0110175255686045, -0.8269239664077759));
	target2 += mul(br1, float4x4(-0.1379280686378479, 0.1004267781972885, 0.0723998174071312, -0.1510958224534988, 0.0610648579895496, 0.0451720170676708, -0.0231927260756493, -0.0251553766429424, 0.2306085377931595, 0.1033207178115845, -0.1316205114126205, 0.1130664870142937, -0.0458516106009483, -0.1152514070272446, -0.0088650323450565, -0.0214479379355907));
	target2 += mul(br2, float4x4(-0.0545783303678036, -0.0620098188519478, 0.0347074456512928, 0.1096799224615097, 0.0036664425861090, -0.0413107499480247, 0.1443250179290771, -0.1161036714911461, -0.0061624986119568, -0.0252977479249239, 0.3230019211769104, -0.2536626160144806, -0.0565439648926258, 0.0827583819627762, -0.0071726376190782, -0.1983329951763153));
	target2 = max(target2, 0) + float4(-0.6312188506126404, -0.1215368881821632, 0.2487443536520004, 0.4051703512668610) * min(target2, 0);

	tex3[gxy] = target1;
	tex4[gxy] = target2;
}


//!PASS 4
//!DESC mapping 3
//!IN tex3, tex4
//!OUT tex1, tex2
//!BLOCK_SIZE 8
//!NUM_THREADS 64

// Pass 4 ("mapping 3"): for one pixel, gathers a 3x3 neighbourhood from the two
// four-channel textures tex3 and tex4, multiplies every sample by a constant
// 4x4 weight matrix, accumulates the products into two four-channel results,
// applies a per-channel leaky activation and stores the results to tex1 and tex2.
//
// blockStart: added to the per-thread offset to form the pixel coordinate gxy.
// threadId:   only threadId.x is used; it is passed to Rmp8x8 to obtain the
//             per-thread offset (presumably a remap of the flat index of the
//             64 threads onto the 8x8 block -- confirm against Rmp8x8).
void Pass4(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	// Threads whose pixel falls outside the input dimensions do nothing.
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	// NOTE(review): inputPt is presumably the size of one input texel in
	// normalized texture coordinates, which would make pos the centre of
	// texel gxy -- confirm against GetInputPt.
	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// Naming of the 3x3 neighbourhood around pos (one inputPt step per cell):
	// [tl, tc, tr]
	// [ml, mc, mr]
	// [bl, bc, br]
	// Suffix 1: samples taken from tex3.
	float4 tl1 = tex3.SampleLevel(sam, pos - inputPt, 0);
	float4 ml1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc1 = tex3.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc1 = tex3.SampleLevel(sam, pos, 0);
	float4 bc1 = tex3.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br1 = tex3.SampleLevel(sam, pos + inputPt, 0);

	// Suffix 2: the same nine positions sampled from tex4.
	float4 tl2 = tex4.SampleLevel(sam, pos - inputPt, 0);
	float4 ml2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc2 = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc2 = tex4.SampleLevel(sam, pos, 0);
	float4 bc2 = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br2 = tex4.SampleLevel(sam, pos + inputPt, 0);

	// First four output channels. The accumulator starts from a constant vector
	// (presumably the layer bias) and receives one vector-matrix product per
	// sample: 9 positions x 2 source textures = 18 terms.
	// The weights are constants; do not reorder the terms, since floating-point
	// accumulation order affects the result.
	float4 target1 = float4(-0.0410279631614685,-0.1111723631620407,-0.0406232848763466,-0.0939496159553528);
	target1 += mul(tl1, float4x4(0.1221675798296928, 0.0083215842023492, -0.0162804014980793, 0.0316714197397232, -0.2205813378095627, 0.1500435769557953, 0.2109555304050446, 0.2741867899894714, 0.0956874340772629, -0.0896854698657990, -0.1657065600156784, -0.1349759399890900, 0.0601499564945698, -0.1523845940828323, -0.1828087568283081, -0.2727653682231903));
	target1 += mul(tl2, float4x4(-0.0918163508176804, 0.1564485579729080, 0.1133174449205399, 0.2215953171253204, -0.0623677000403404, -0.0497728772461414, -0.0372809022665024, -0.0258478187024593, -0.1364922970533371, 0.1053884625434875, 0.3292874991893768, 0.2693256139755249, -0.0347631797194481, -0.1470523178577423, 0.0096792401745915, -0.0542853325605392));
	target1 += mul(ml1, float4x4(0.1331177949905396, -0.0964357852935791, -0.0706946700811386, 0.1593225002288818, -0.4815943241119385, 0.1224092170596123, -0.0870430991053581, 0.0005010276800022, -0.0242684502154589, -0.2256436049938202, 0.1367238312959671, 0.0474774017930031, 0.6886650323867798, -0.0065326127223670, 0.1841574758291245, -0.1354993879795074));
	target1 += mul(ml2, float4x4(-0.1049591675400734, 0.0515934228897095, 0.1128631457686424, 0.1688040047883987, -0.0084041170775890, -0.0006375144002959, -0.0598374009132385, 0.1424416452646255, -0.0048398924991488, 0.1832167655229568, 0.0231959503144026, 0.0816788375377655, -0.1321710795164108, 0.0397678017616272, -0.0058345394209027, 0.5784573554992676));
	target1 += mul(bl1, float4x4(0.1438693851232529, -0.0694608166813850, -0.0428275354206562, 0.1599996536970139, -0.1651254445314407, 0.1388883888721466, -0.0895452573895454, 0.2569831907749176, 0.3150432109832764, -0.0910519883036613, 0.0367441214621067, 0.1903669685125351, 0.2805841267108917, -0.0444608181715012, 0.0059385276399553, -0.2585869431495667));
	target1 += mul(bl2, float4x4(-0.1217494234442711, 0.0191769022494555, -0.0065453462302685, 0.1391217857599258, 0.0998920649290085, -0.0162798929959536, 0.0502282194793224, 0.0370145924389362, 0.0290782172232866, -0.0099554909393191, 0.0142515478655696, 0.1248661577701569, -0.0076912571676075, 0.0251651499420404, 0.2190572917461395, 0.0020069130696356));
	target1 += mul(tc1, float4x4(0.2666685581207275, -0.1625511497259140, -0.3938800692558289, -0.0253848694264889, 0.0987015441060066, 0.2033616453409195, 0.3128099143505096, 0.4608893990516663, 0.0620003379881382, -0.1389972567558289, -0.3095863461494446, -0.4023511111736298, -0.1105777546763420, 0.1115406602621078, 0.3639950752258301, 0.0645622834563255));
	target1 += mul(tc2, float4x4(-0.2135885655879974, -0.1035343706607819, 0.1795026361942291, 0.1828210204839706, 0.0780984908342361, 0.0656728670001030, 0.0033678691834211, 0.1361345648765564, 0.1712654232978821, -0.0172833092510700, -0.0502183400094509, 0.2910411655902863, 0.0691247656941414, 0.1935720741748810, 0.0652214139699936, 0.1608240753412247));
	target1 += mul(mc1, float4x4(0.8243460655212402, -0.0979344248771667, -0.0366373993456364, 0.1692261099815369, 0.5517869591712952, 0.3282494544982910, -0.7905511856079102, -0.4462923705577850, -0.0803156569600105, 0.1172509342432022, 0.1864327639341354, 0.1471016854047775, 0.1296005547046661, -0.1004103720188141, 0.3174172043800354, -0.1181766316294670));
	target1 += mul(mc2, float4x4(0.0259374529123306, -0.0934808850288391, 0.3008874654769897, 0.3957927823066711, -0.4048821926116943, 0.1461934000253677, -0.1819096356630325, -0.1908810287714005, 0.3193186521530151, -0.7438099980354309, 0.1919509470462799, -0.2065188735723495, 0.1752236187458038, -0.6840037107467651, 0.1588519066572189, -0.3956064879894257));
	target1 += mul(bc1, float4x4(0.1574442386627197, -0.0114925103262067, -0.1208277940750122, 0.2058266401290894, 0.2879209220409393, -0.0419875606894493, -0.1902059614658356, -0.2723863720893860, -0.1086223348975182, -0.0870924964547157, 0.8605937957763672, 0.2656622231006622, -0.1653763055801392, 0.0816384851932526, -0.0137870563194156, 0.1433854848146439));
	target1 += mul(bc2, float4x4(-0.1565909236669540, -0.0307490080595016, -0.1055604666471481, 0.2573592662811279, -0.1186821162700653, 0.1141471788287163, -0.0272745657712221, -0.1049114838242531, 0.2445316016674042, -0.0027864547446370, -0.1759569346904755, -0.1556979566812515, 0.0550616309046745, 0.1704383641481400, 0.0853662937879562, 0.3280856907367706));
	target1 += mul(tr1, float4x4(0.1460669338703156, 0.4202052652835846, -0.3638312816619873, -0.0958623066544533, -0.0492525361478329, -0.3664234280586243, 0.0794373303651810, 0.0399017669260502, 0.0629198029637337, 0.1662959158420563, -0.1001493930816650, -0.0587460733950138, -0.0396478697657585, 0.0017320754704997, 0.0314909480512142, -0.0202700830996037));
	target1 += mul(tr2, float4x4(-0.0964399129152298, 0.0380319654941559, 0.0396055467426777, 0.0265473183244467, -0.0161637403070927, -0.1872924566268921, 0.1670000404119492, 0.0029466480482370, -0.1093841269612312, -0.3629201948642731, -0.0562992505729198, 0.1792684197425842, -0.0203859098255634, 0.0983991250395775, 0.0058611719869077, 0.1627455651760101));
	target1 += mul(mr1, float4x4(-0.1117974221706390, 0.7562329173088074, -0.2046248912811279, 0.1677842289209366, -0.2063486129045486, -0.6023545265197754, -0.5739209651947021, 0.5110496878623962, -0.0715268924832344, -0.1373793482780457, 0.1251420378684998, -0.0477442294359207, 0.4961377978324890, 0.2688887119293213, 0.3146316707134247, -0.5197153687477112));
	target1 += mul(mr2, float4x4(-0.1314805448055267, 0.0746279135346413, 0.3457699418067932, 0.2564856410026550, 0.0839370116591454, -0.6136511564254761, -0.4646295011043549, 0.0612256154417992, -0.1910563558340073, -0.0935136750340462, -0.2426030039787292, 0.2102959007024765, 0.1575350016355515, 0.6145061254501343, 0.3368154168128967, -0.0974092856049538));
	target1 += mul(br1, float4x4(0.0565315335988998, 0.2393359094858170, -0.0932938233017921, 0.1555283814668655, 0.0123879108577967, -0.1247719228267670, -0.0564610138535500, -0.1125799044966698, -0.0104600470513105, 0.0482629500329494, 0.2316472232341766, 0.1083717569708824, -0.0525921434164047, 0.0643989592790604, -0.0525734610855579, -0.0503251366317272));
	target1 += mul(br2, float4x4(-0.1835366338491440, 0.0978360474109650, -0.1111819595098495, 0.2109299153089523, 0.0509372949600220, -0.1992686837911606, 0.0677929744124413, -0.0870024710893631, -0.0412262082099915, -0.0697719156742096, -0.0967373847961426, 0.0137308547273278, 0.0195730421692133, 0.0410240143537521, 0.1157210171222687, 0.2283479571342468));
	// Activation: the positive part passes through unchanged, the negative part
	// is scaled by a constant per-channel factor.
	target1 = max(target1, 0) + float4(0.1991519331932068, -0.1275756657123566, -0.0622864030301571, 0.1586369574069977) * min(target1, 0);

	// Second four output channels: same 18 samples, a different set of constant
	// weights, start vector and negative-slope factors.
	float4 target2 = float4(-0.0089084329083562,-0.0336172059178352,0.0177190825343132,0.0529975406825542);
	target2 += mul(tl1, float4x4(-0.0275970958173275, 0.0141968233510852, 0.1181544512510300, -0.0572245270013809, 0.1161347925662994, -0.1156444773077965, -0.2549640238285065, 0.0882879272103310, -0.0715355500578880, 0.0151285668835044, 0.1079384386539459, 0.0650847703218460, -0.1597152203321457, 0.0669793561100960, 0.2084401696920395, -0.0951152443885803));
	target2 += mul(tl2, float4x4(0.0404323227703571, -0.0206144321709871, -0.1080420613288879, -0.2038477361202240, 0.0248847268521786, -0.0064681121148169, 0.0389525443315506, 0.0011026862775907, 0.0885242074728012, 0.0295896343886852, -0.3323790132999420, 0.1935138553380966, -0.0466548874974251, 0.1023886054754257, 0.1257870644330978, -0.1541756242513657));
	target2 += mul(ml1, float4x4(-0.0076520540751517, 0.0361139886081219, 0.1749804913997650, -0.2051989138126373, 0.0022692133206874, -0.0282937753945589, -0.2039019316434860, -0.2343468815088272, -0.0357327871024609, -0.0570764988660812, 0.2925858795642853, -0.1988349705934525, -0.0584560707211494, -0.0341510921716690, 0.1300961822271347, 0.5184492468833923));
	target2 += mul(ml2, float4x4(0.0884973928332329, 0.0333527140319347, 0.0180535931140184, -0.2655122876167297, 0.0433661043643951, 0.0104369185864925, 0.0010909073753282, -0.0705273598432541, -0.0602585524320602, 0.2420269846916199, -0.4731841087341309, -0.8040290474891663, 0.3066828548908234, -0.2466925680637360, 0.0938910692930222, -0.2002603262662888));
	target2 += mul(bl1, float4x4(0.0549152903258801, 0.0291299298405647, 0.0946277007460594, -0.0581608228385448, 0.0669180899858475, -0.0635575056076050, -0.2427970170974731, -0.2677550315856934, 0.2226776182651520, 0.1301570236682892, -0.1519709974527359, 0.0671724304556847, -0.0526433289051056, 0.1898351758718491, 0.2383745312690735, 0.21917118132114417));
	target2 += mul(bl2, float4x4(-0.0234222635626793, 0.0238620284944773, 0.0427630320191383, -0.1080563366413116, 0.0332126952707767, -0.0039051575586200, 0.0293126031756401, 0.0161924213171005, 0.0453971028327942, 0.0131999952718616, -0.0689036697149277, 0.2349009960889816, 0.1013344153761864, 0.2706570029258728, 0.1191426888108253, -0.2830821871757507));
	target2 += mul(tc1, float4x4(0.0181465242058039, -0.0571886636316776, 0.4875229001045227, -0.4244020283222198, 0.4331104159355164, 0.1066712513566017, -0.5277034044265747, 0.1110567077994347, -0.1179447323083878, -0.0273578558117151, 0.1798476576805115, -0.2829602360725403, 0.1012385115027428, -0.2528488039970398, 0.1697608679533005, 0.1121710017323494));
	target2 += mul(tc2, float4x4(-0.1404130905866623, -0.0984055623412132, -0.0279541295021772, -0.1321212500333786, -0.0841855704784393, 0.1336171030998230, -0.1458790600299835, -0.0044095455668867, 0.2203754037618637, 0.1455714553594589, -0.2362042963504791, -0.0329121425747871, -0.1683547794818878, 0.0289597529917955, 0.3424547612667084, 0.0143845872953534));
	target2 += mul(mc1, float4x4(0.0287246014922857, 0.1948280781507492, 0.5998955368995667, 0.1192114129662514, -0.6269109249114990, 0.8724324703216553, -0.6399638652801514, -0.4201497733592987, -0.3355066180229187, -0.1566904038190842, -0.4396412074565887, 0.1525828838348389, 0.5573399066925049, 0.2324324846267700, 0.2762884795665741, 0.0406046211719513));
	target2 += mul(mc2, float4x4(0.3890096545219421, -0.0574061162769794, -0.1468243300914764, -0.5953360199928284, -0.1363215148448944, -0.2224670499563217, -0.2237723320722580, 0.2738097012042999, -0.4868114292621613, -0.5029351711273193, -0.3570256233215332, -0.1776263266801834, -0.0176672954112291, -0.4318660795688629, 1.0395888090133667, 0.1728395074605942));
	target2 += mul(bc1, float4x4(0.1337304115295410, -0.0809440389275551, 0.1600498855113983, -0.1108811497688293, -0.2376178801059723, -0.1532768607139587, -0.0447455830872059, 0.2515332102775574, 0.4848278462886810, -0.0915748402476311, -0.0336527302861214, -0.2141884714365005, 0.2125129699707031, 0.3237875998020172, 0.0022272330243140, -0.0167857185006142));
	target2 += mul(bc2, float4x4(0.0457934997975826, 0.0510537698864937, -0.0519523508846760, -0.4506326615810394, -0.1029204949736595, 0.0116113182157278, -0.1750748157501221, -0.0048758201301098, 0.1506977379322052, 0.0633068457245827, -0.1628549993038177, -0.0144928665831685, 0.1408756822347641, 0.2896180152893066, 0.0803691521286964, -0.4930096566677094));
	target2 += mul(tr1, float4x4(-0.0484248884022236, 0.1371297985315323, -0.1235475391149521, -0.2618594765663147, -0.0280395895242691, 0.0248795989900827, 0.1204105168581009, 0.3246576189994812, 0.0426272377371788, -0.0520061068236828, 0.0575957447290421, -0.2613646090030670, 0.1165295541286469, -0.0390013493597507, -0.0470846109092236, -0.0014663023175672));
	target2 += mul(tr2, float4x4(-0.1066762879490852, -0.0869804695248604, -0.0099332248792052, -0.1355892717838287, -0.0760413780808449, 0.1377770304679871, -0.0263407956808805, 0.0880135521292686, 0.1496269851922989, -0.0487459264695644, 0.1286851912736893, 0.2218491584062576, 0.1723349541425705, -0.0165541302412748, -0.0690477639436722, -0.2388458102941513));
	target2 += mul(mr1, float4x4(-0.4236431121826172, 0.0465179122984409, -0.1526456624269485, 0.1426440477371216, 0.5913932919502258, -0.1082349196076393, 0.2731275856494904, -0.2687640488147736, -0.4628683030605316, -0.0537119321525097, -0.1597615629434586, 0.0528527684509754, -0.3485085070133209, 0.1395110934972763, 0.0642972290515900, 0.0323829315602779));
	target2 += mul(mr2, float4x4(0.0066713397391140, -0.0482029877603054, -0.1707276403903961, -0.1001396998763084, 0.0539822019636631, -0.1624453216791153, 0.4913550019264221, 0.3687861263751984, 0.0491421781480312, 0.1311376541852951, 0.0992425829172134, -0.4636098444461823, -0.3415873646736145, -0.0153833786025643, -0.0270162131637335, -0.0935514941811562));
	target2 += mul(br1, float4x4(-0.1738258153200150, 0.0458541549742222, -0.0653749182820320, -0.0156540926545858, -0.0357586294412613, -0.1486178338527679, 0.1798035055398941, -0.1310307979583740, 0.0783249065279961, -0.0261360015720129, -0.1047066971659660, 0.3385537564754486, -0.0339452810585499, 0.2299628853797913, -0.1408322304487228, -0.0352708548307419));
	target2 += mul(br2, float4x4(0.0463018082082272, 0.0565674640238285, -0.0538956597447395, -0.2354862987995148, 0.0297824125736952, 0.0307939313352108, 0.1271791011095047, -0.1025698855519295, 0.1060482114553452, -0.0703211054205894, -0.0083062350749969, 0.0474255047738552, 0.0442508421838284, 0.1569559425115585, -0.0442709513008595, -0.1188704669475555));
	// Same activation form as for target1, with its own per-channel factors.
	target2 = max(target2, 0) + float4(0.7366524934768677, 1.0013850927352905, -0.0276311747729778, 0.0734841898083687) * min(target2, 0);

	// Store both four-channel results at this pixel.
	tex1[gxy] = target1;
	tex2[gxy] = target2;
}


//!PASS 5
//!DESC mapping 4, sub-band residuals
//!IN tex1, tex2, featureMap1, featureMap2
//!OUT tex3, tex4
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass5(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [tl, tc, tr]
	// [ml, mc, mr]
	// [bl, bc, br]
	float4 tl1 = tex1.SampleLevel(sam, pos - inputPt, 0);
	float4 ml1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc1 = tex1.SampleLevel(sam, pos, 0);
	float4 bc1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br1 = tex1.SampleLevel(sam, pos + inputPt, 0);

	float4 tl2 = tex2.SampleLevel(sam, pos - inputPt, 0);
	float4 ml2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc2 = tex2.SampleLevel(sam, pos, 0);
	float4 bc2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br2 = tex2.SampleLevel(sam, pos + inputPt, 0);

	float4 c1 = { -0.1306160986423492,-0.0808217376470566,-0.2880123555660248,0.0099629526957870 };
	c1 += mul(tl1, float4x4(-0.1033539846539497, 0.0541300140321255, -0.0804840475320816, -0.0334571413695812, -0.0264753755182028, 0.1118840202689171, 0.1186013221740723, -0.0127575425431132, 0.2236593365669250, 0.0025286162272096, 0.0985530614852905, 0.0685181617736816, -0.1884875595569611, 0.0530862808227539, -0.0482063069939613, 0.0375233069062233));
	c1 += mul(tl2, float4x4(0.1837068796157837, -0.0632847175002098, 0.0016613919287920, 0.0392861217260361, 0.2923883199691772, -0.1713902205228806, 0.1907587945461273, 0.0550456829369068, 0.0644215345382690, -0.1046456992626190, 0.0187383033335209, 0.0770180150866508, 0.1933846622705460, -0.0455715768039227, 0.0375007353723049, -0.1053109914064407));
	c1 += mul(ml1, float4x4(-0.0972480997443199, 0.2820451855659485, 0.0114549007266760, -0.0954328626394272, 0.0706252008676529, 0.4829064607620239, -0.6371517181396484, 0.0005180989392102, 0.3280143439769745, 0.0665246024727821, -0.0503116399049759, -0.1261110603809357, 0.1114177703857422, -0.2053108513355255, 0.1428771317005157, 0.3926100134849548));
	c1 += mul(ml2, float4x4(-0.2571723163127899, 0.1627264618873596, -0.4940335154533386, -0.1361546218395233, 0.0804422944784164, -0.4231885373592377, 0.0650202706456184, 0.0518481098115444, -0.0502478554844856, -0.1305799931287766, 0.1814480125904083, 0.0090866927057505, -0.0510044656693935, -0.1691461503505707, 0.0922467112541199, -0.0314207412302494));
	c1 += mul(bl1, float4x4(0.1270498335361481, 0.0563284493982792, -0.0435525141656399, 0.1569847911596298, 0.0576847903430462, 0.3461692929267883, -0.0325655154883862, -0.2688976824283600, -0.1341977864503860, -0.1382253766059875, 0.2293784171342850, -0.1111817285418510, -0.1402447521686554, -0.3257531225681305, 0.0598510466516018, 0.1008039116859436));
	c1 += mul(bl2, float4x4(0.1698816716670990, 0.3491003513336182, -0.1367681026458740, -0.1165873408317566, -0.2091718912124634, -0.1487034261226654, -0.0569749698042870, -0.2100717276334763, 0.0404917001724243, -0.1372035890817642, 0.0689046755433083, -0.0367818064987659, -0.0325474888086319, -0.0114965448155999, -0.0137249026447535, -0.0279692262411118));
	c1 += mul(tc1, float4x4(-0.0563433989882469, 0.0132494345307350, -0.2434540390968323, 0.0796563774347305, -0.2109155058860779, 0.0387088693678379, -0.0591037571430206, 0.0955820381641388, 0.4660535752773285, -0.1204202473163605, 0.1332369595766068, -0.0285425651818514, -0.3886952698230743, -0.0434980578720570, -0.0849134400486946, 0.0802380964159966));
	c1 += mul(tc2, float4x4(0.0412235632538795, 0.1571959257125854, 0.2050069272518158, -0.1138664111495018, 0.1962715685367584, 0.0594439841806889, 0.0351715497672558, -0.0129811102524400, 0.2055217623710632, -0.0647534057497978, 0.0373471938073635, 0.0877277255058289, -0.5734645724296570, 0.1188675239682198, -0.1145943328738213, -0.1182733029127121));
	c1 += mul(mc1, float4x4(-0.2004909217357635, -0.4817073047161102, 0.5596802830696106, -0.0327854752540588, 0.0989314392209053, 0.4127818942070007, 0.7265836596488953, -0.2692042589187622, 0.5195841789245605, -0.2357539832592010, -0.3819393217563629, 0.1755530238151550, 0.6578183770179749, 0.1075539961457253, -0.2688144743442535, 0.3242723941802979));
	c1 += mul(mc2, float4x4(-0.3221310675144196, 0.2978510260581970, 0.2269985526800156, -0.3184116482734680, 0.4845580160617828, 0.4407236874103546, 0.0099756307899952, -0.3121858239173889, -0.3810067176818848, -0.0553649961948395, 0.0202834140509367, 0.0409953594207764, 0.2532750964164734, 0.2731618583202362, 0.1237529441714287, 0.0134243080392480));
	c1 += mul(bc1, float4x4(0.1835541725158691, 0.0549701862037182, -0.1749316602945328, -0.2030028849840164, 0.0263462308794260, 0.2781440317630768, 0.0372458845376968, 0.3643021881580353, -0.4047883749008179, 0.0660117194056511, 0.4863115549087524, -0.2024163603782654, -0.6403482556343079, 0.2765505611896515, 0.1417075097560883, 0.5064445734024048));
	c1 += mul(bc2, float4x4(0.6106975078582764, -0.1570862233638763, -0.3223383128643036, -0.2497926801443100, -0.4854303300380707, 0.0132978223264217, -0.0609334111213684, 0.1285556703805923, -0.1412864029407501, -0.1379042416810989, -0.0258826259523630, 0.1357705891132355, -0.1285902857780457, -0.0577826797962189, 0.0550044551491737, 0.1717510819435120));
	c1 += mul(tr1, float4x4(0.1389609426259995, 0.0835867226123810, 0.0309768319129944, -0.0278116948902607, -0.0390677824616432, -0.0111810686066747, -0.0025318188127130, 0.0069569633342326, 0.0347319357097149, 0.0191543344408274, 0.0314339138567448, -0.0228427499532700, 0.0416300334036350, 0.0249234102666378, 0.1210031509399414, 0.1142473593354225));
	c1 += mul(tr2, float4x4(0.0607251487672329, 0.0386395826935768, -0.0219341218471527, -0.1102298423647881, 0.1487188935279846, 0.0602982006967068, -0.0280748903751373, -0.0211924221366644, 0.0042894422076643, -0.0269144997000694, 0.0814756453037262, -0.0314031280577183, -0.0213186051696539, -0.1362965404987335, 0.0382767543196678, -0.0669511556625366));
	c1 += mul(mr1, float4x4(-0.2397561967372894, 0.3023172020912170, -0.2398054003715515, 0.0041919997893274, -0.1016605198383331, -0.1521034836769104, -0.1526568531990051, 0.0272433310747147, 0.0741761848330498, 0.1116370111703873, 0.1149727106094360, -0.0809784531593323, -0.1448147594928741, -0.0943927690386772, -0.0086280042305589, 0.1243222951889038));
	c1 += mul(mr2, float4x4(-0.0469366572797298, -0.1655988991260529, -0.1029584184288979, -0.1347874104976654, 0.2064601778984070, 0.0521226711571217, -0.1366733759641647, -0.0041872998699546, 0.1077186539769173, 0.0184442866593599, -0.2309073060750961, -0.1637075096368790, -0.0417953692376614, -0.3190860450267792, -0.1593534499406815, 0.0136412177234888));
	c1 += mul(br1, float4x4(0.1698798984289169, 0.0232755411416292, -0.0876034423708916, -0.3008348643779755, 0.0789884999394417, 0.0034748215693980, -0.0064704762771726, 0.0057828431017697, -0.0190630126744509, -0.0334153175354004, -0.0195646341890097, 0.0105131156742573, 0.0995147302746773, -0.3130289018154144, -0.0724022984504700, 0.0113303456455469));
	c1 += mul(br2, float4x4(-0.0027791252359748, -0.0193455871194601, -0.0415000133216381, 0.0568981170654297, -0.2745247483253479, 0.1222846284508705, 0.1899162530899048, 0.1067754998803139, -0.0561975166201591, -0.1500336527824402, 0.0526139959692955, -0.3491798937320709, -0.0692384615540504, -0.0307095069438219, 0.0498757846653461, 0.0019003645284101));
	c1 = max(c1, 0) + float4(0.1552927196025848, 0.0782765746116638, 0.7966942191123962, -1.1619627475738525) * min(c1, 0);

	float4 c2 = { -0.1443098634481430,-0.1343899369239807,-0.0624338127672672,-0.1094277128577232 };
	c2 += mul(tl1, float4x4(-0.0689977407455444, -0.1693786680698395, 0.0109281269833446, 0.0609922930598259, 0.0296908002346754, 0.1195700988173485, -0.0694077461957932, 0.0971287414431572, 0.0253518298268318, 0.1213042959570885, 0.0703809782862663, 0.0055739870294929, -0.1595942378044128, -0.1336689442396164, -0.0622441768646240, -0.0428023114800453));
	c2 += mul(tl2, float4x4(0.0860001668334007, -0.0226618759334087, 0.1602241247892380, 0.0431661494076252, 0.1526461094617844, 0.2752982378005981, 0.0960300788283348, -0.0536719262599945, -0.0171773489564657, 0.0457364916801453, -0.0360932648181915, -0.0397153608500957, -0.0277090407907963, 0.0729821547865868, -0.0145150292664766, 0.0252893269062042));
	c2 += mul(ml1, float4x4(-0.1407091915607452, -0.4007499516010284, -0.0302001200616360, -0.0606933943927288, -0.2960600554943085, -0.2263117432594299, 0.0721478462219238, -0.4578711986541748, 0.0960150733590126, -0.1606502830982208, 0.2444226741790771, 0.0000882153908606, 0.1472496986389160, 0.3256779909133911, -0.2132861614227295, 0.0339313484728336));
	c2 += mul(ml2, float4x4(-0.1477648764848709, -0.1487885862588882, -0.1973863691091537, 0.0717295333743095, 0.0843430235981941, 0.6259996294975281, -0.1214931011199951, -0.1274987608194351, 0.2359549105167389, 0.3002171218395233, -0.0825233608484268, -0.0157950688153505, 0.0706149637699127, 0.1762917637825012, -0.0611497573554516, -0.0859689489006996));
	c2 += mul(bl1, float4x4(0.0174895934760571, -0.0567042417824268, 0.0409146919846535, 0.0258173532783985, 0.1421577036380768, 0.1234543323516846, -0.1721662431955338, 0.1492216140031815, 0.1100751459598541, 0.0501539446413517, 0.1100447699427605, -0.1086079254746437, -0.0608497932553291, 0.0087817469611764, 0.0714464113116264, -0.1285197436809540));
	c2 += mul(bl2, float4x4(-0.0017177806003019, -0.1463395059108734, -0.1085453778505325, 0.1650195866823196, 0.0813829153776169, 0.1102061793208122, -0.0578421875834465, -0.0232036896049976, -0.1239888817071915, 0.0155465165153146, 0.1079114526510239, -0.0420837886631489, -0.0775837749242783, 0.0148941157385707, -0.0502299368381500, -0.0654754191637039));
	c2 += mul(tc1, float4x4(0.0918162539601326, 0.0440697595477104, -0.0515748932957649, 0.0417411290109158, 0.0353216230869293, 0.1535954177379608, 0.0439723692834377, -0.1288845241069794, 0.1076577678322792, -0.1306740194559097, 0.0715952813625336, -0.0681907683610916, -0.3798767924308777, 0.1023928597569466, -0.0970670804381371, 0.0077168666757643));
	c2 += mul(tc2, float4x4(0.0634560957551003, -0.0550306066870689, 0.2073986232280731, 0.0520241297781467, 0.1162287592887878, -0.2218665480613708, 0.3199682831764221, 0.0606246069073677, -0.0058511858806014, -0.0667045339941978, -0.0449917949736118, 0.0707788690924644, -0.3323366343975067, -0.0763893201947212, -0.0997853428125381, -0.1181001588702202));
	c2 += mul(mc1, float4x4(-0.3101258873939514, 0.2616009712219238, 0.0584651045501232, 0.1656491309404373, -0.0069236233830452, 0.2573371529579163, -0.1793291717767715, -0.2718756198883057, 0.0953581258654594, 0.0524105131626129, 0.1183085516095161, 0.0583294369280338, 0.5036848187446594, -0.5763167142868042, -0.2119628041982651, -0.3140562772750854));
	c2 += mul(mc2, float4x4(-0.2497755438089371, -0.0146329319104552, -0.2741575539112091, 0.2459975033998489, 0.3562706708908081, -0.6528629064559937, -0.4287456274032593, 0.2055913358926773, 0.1739019453525543, -0.3855968713760376, -0.0958273336291313, -0.7066691517829895, 0.2365748286247253, -0.3046728968620300, -0.2590373754501343, -0.0496727414429188));
	c2 += mul(bc1, float4x4(-0.0844531357288361, -0.0321611948311329, -0.0951840654015541, 0.0577518045902252, -0.1606003493070602, 0.2776086628437042, -0.1355003118515015, -0.0880064144730568, -0.1277643740177155, -0.0514567233622074, 0.1522682905197144, -0.1040910631418228, -0.2767944037914276, -0.1452194601297379, 0.0089118303731084, 0.0231996178627014));
	c2 += mul(bc2, float4x4(0.2603267133235931, 0.0167464651167393, -0.2064073234796524, 0.1782064288854599, 0.4890212416648865, 0.0559245310723782, 0.1221160590648651, -0.0202587731182575, -0.4056585729122162, -0.1839511841535568, 0.2775998413562775, 0.0024275144096464, -0.2624500989913940, -0.0619418807327747, 0.0153478365391493, 0.0123427547514439));
	c2 += mul(tr1, float4x4(0.0816635638475418, -0.0134946266189218, 0.0594766475260258, -0.0551253929734230, 0.0134431896731257, -0.0652195811271667, -0.0563635528087616, -0.0066532371565700, -0.0004114551993553, 0.0105680683627725, 0.1324467360973358, 0.0467248968780041, 0.0301312971860170, -0.1073397025465965, -0.0363437235355377, -0.0474153012037277));
	c2 += mul(tr2, float4x4(0.0199097190052271, 0.0901319086551666, 0.0448978282511234, 0.0505443066358566, 0.0438878424465656, -0.0494784042239189, 0.0724927335977554, -0.0070675504393876, -0.0012125011999160, 0.0295279901474714, 0.0705125033855438, 0.0555334389209747, -0.0403393507003784, -0.1271172016859055, 0.0017914215568453, 0.1462216079235077));
	c2 += mul(mr1, float4x4(-0.2827299833297729, 0.2052399665117264, 0.0042732120491564, -0.3969024717807770, -0.0782120972871780, 0.1960176974534988, -0.0675340741872787, 0.0027962317690253, 0.0516129024326801, -0.0352642722427845, 0.0546326488256454, 0.0065340655855834, -0.1062376946210861, 0.1364430636167526, -0.0536947809159756, 0.2098117172718048));
	c2 += mul(mr2, float4x4(0.0045875865034759, 0.2162927240133286, -0.2158576399087906, -0.0047327815555036, 0.1251590698957443, 0.1279677897691727, -0.1188964918255806, 0.0328494384884834, 0.0076038073748350, -0.0561547242105007, 0.0335608273744583, 0.4332321286201477, 0.0021786799188703, 0.0844521671533585, -0.2102309316396713, -0.0189208015799522));
	c2 += mul(br1, float4x4(0.0933093801140785, 0.1548244059085846, -0.0598701611161232, 0.0357220247387886, -0.1141726672649384, 0.0536412484943867, -0.0159156844019890, -0.0445508137345314, 0.1883231997489929, -0.1547038406133652, 0.0530619807541370, 0.0059371814131737, 0.0602529086172581, -0.0435577929019928, 0.0083390390500426, 0.0191930737346411));
	c2 += mul(br2, float4x4(-0.0351041629910469, 0.2119503468275070, -0.0841927304863930, 0.0079463515430689, 0.0683520361781120, -0.1657009869813919, 0.0611055232584476, -0.0063667562790215, 0.0330024957656860, -0.1810818463563919, 0.0872574150562286, 0.1485669612884521, -0.1305806934833527, 0.0041402997449040, 0.0223289318382740, -0.0141495745629072));
	c2 = max(c2, 0) + float4(0.5769761204719543, 0.1716064810752869, -0.0821026712656021, 0.2092144042253494) * min(c2, 0);

	float4 target1 = float4(0.0245648548007011, -0.4467784762382507, 0.0197526942938566, -0.0110000418499112);
	target1 += mul(c1, float4x4(0.0302665308117867, -0.9262221455574036, -0.1161134764552116, -0.0506900474429131, 0.2716045379638672, -0.0485871583223343, 0.0044713355600834, -0.4274623394012451, 0.0749531090259552, -0.3700785338878632, 0.0350039415061474, -0.0540786534547806, -0.0607390031218529, -0.8019900321960449, 0.0923245251178741, 0.1258827745914459));
	target1 += mul(c2, float4x4(-0.0649135261774063, 0.0815236791968346, 0.0067334296181798, 0.1277425885200500, -0.0051357815973461, -0.1485908329486847, 0.0074226572178304, 0.0050623500719666, 0.0588018335402012, -0.0692552924156189, 0.1288725286722183, -0.0989386290311813, 0.0427936837077141, 0.0967708528041840, -0.0455632135272026, -0.0711275041103363));
	target1 += featureMap1.SampleLevel(sam, pos, 0);
	target1 = max(target1, 0) + float4(0.9927186965942383, 0.0570580027997494, 1.3226752281188965, 1.0069466829299927) * min(target1, 0);

	float4 target2 = float4(-0.0425243787467480, -0.3715015351772308, -0.0256227850914001, -0.2774516046047211);
	target2 += mul(c1, float4x4(0.0238118842244148, 0.0295480657368898, -0.0066418983042240, 0.1021223962306976, -0.0568209178745747, -0.4355100393295288, -0.2700522541999817, -0.2060186564922333, -0.0689613372087479, -0.1689691990613937, -0.0306748505681753, -0.2461252212524414, -0.0057375836186111, -0.1892303228378296, -0.0285871494561434, -0.5032613277435303));
	target2 += mul(c2, float4x4(0.5463213324546814, 0.0972800329327583, 0.0307560767978430, 0.0678058937191963, -0.0356063023209572, -0.7013865113258362, 0.1890443563461304, -0.1036657467484474, -0.1745826154947281, -0.2942218780517578, -0.0485423319041729, -0.2983124554157257, -0.0524431839585304, -0.3261034786701202, 0.3217246532440186, 0.1958018541336060));
	target2 += featureMap2.SampleLevel(sam, pos, 0);
	target2 = max(target2, 0) + float4(0.1391339898109436, 0.0960328355431557, 0.6235341429710388, 0.1177272796630859) * min(target2, 0);

	tex3[gxy] = target1;
	tex4[gxy] = target2;
}


//!PASS 6
//!DESC sub-pixel convolution, aggregation 
//!IN tex3, tex4, INPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// RGB -> two chroma components. Used as mul(rgb2uv, rgb), so each row
// is dotted with the RGB triple: row 0 gives the first chroma value,
// row 1 the second. Luma is not produced by this matrix.
// NOTE(review): coefficients look like the BT.601 Cb/Cr rows - confirm.
static const float2x3 rgb2uv = float2x3(
	-0.169, -0.331,  0.5,
	 0.5,   -0.419, -0.081
);

// (luma, chroma0, chroma1) -> RGB. Used as mul(yuv2rgb, yuv), so each
// row yields one output channel: row 0 = R, row 1 = G, row 2 = B.
// Every row takes luma with weight 1.
static const float3x3 yuv2rgb = float3x3(
	1.0, -0.00093,  1.401687,
	1.0, -0.3437,  -0.71417,
	1.0,  1.77216,  0.00099
);

// Final pass: sub-pixel convolution and aggregation.
//
// Each thread handles one 2x2 group of output pixels whose top-left
// corner is gxy. All four pixels share the same source texel
// (gxy >> 1): a 3x3 neighbourhood of tex3 and tex4 around that texel is
// run through the last convolution, yielding a float4 whose four
// channels are the luma values of the four output pixels. Chroma is
// taken from INPUT at the same source texel, so it is identical for
// the whole 2x2 group.
//
// blockStart: origin of this thread group's block in output pixels.
// threadId:   only .x is used; it is mapped to a position inside the
//             block by Rmp8x8 (framework helper, not defined in this file).
void Pass6(uint2 blockStart, uint3 threadId) {
	// << 1 because every thread covers a 2x2 output footprint.
	uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;

	// NOTE(review): CheckViewport comes from the host framework;
	// presumably it reports whether the pixel lies inside the output
	// area - confirm. If the top-left pixel fails, the whole 2x2 group
	// is skipped.
	if (!CheckViewport(gxy)) {
		return;
	}

	float2 inputPt = GetInputPt();

	// Centre of the source texel shared by this 2x2 output group.
	float2 pos = ((gxy >> 1) + 0.5f) * inputPt;

	// 3x3 neighbourhood around pos, named by position:
	// [tl, tc, tr]
	// [ml, mc, mr]
	// [bl, bc, br]
	// Suffix 1 = tex3, suffix 2 = tex4.
	float4 tl1 = tex3.SampleLevel(sam, pos - inputPt, 0);
	float4 ml1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc1 = tex3.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc1 = tex3.SampleLevel(sam, pos, 0);
	float4 bc1 = tex3.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br1 = tex3.SampleLevel(sam, pos + inputPt, 0);

	float4 tl2 = tex4.SampleLevel(sam, pos - inputPt, 0);
	float4 ml2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc2 = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc2 = tex4.SampleLevel(sam, pos, 0);
	float4 bc2 = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br2 = tex4.SampleLevel(sam, pos + inputPt, 0);

	// Bias followed by one 4x4 weight matrix per (tap, source texture).
	// The accumulation order is kept fixed so results stay bit-identical.
	float4 result = { 0.2010385394096375,0.2058132737874985,0.1918809115886688,0.1961363703012466 };
	result += mul(tl1, float4x4(-0.0005980331334285, -0.0095877395942807, -0.0149448839947581, -0.0026380482595414, 0.0320665836334229, -0.0706205591559410, -0.0054677254520357, 0.0215112231671810, -0.0025710910558701, -0.0000433265340689, 0.0044494951143861, -0.0034823501482606, -0.0050858515314758, 0.0109513988718390, 0.0208286065608263, -0.0032168829347938));
	result += mul(tl2, float4x4(-0.0145305208861828, 0.0246876608580351, -0.0038286084309220, -0.0033089490607381, -0.0920709222555161, -0.0767898634076118, 0.0012083095498383, -0.0751532614231110, 0.0001302754972130, -0.0107085108757019, -0.0010383903281763, -0.0059571005403996, 0.0809685289859772, 0.0414833538234234, 0.0227938480675220, -0.0211347509175539));
	result += mul(ml1, float4x4(0.0160999298095703, 0.0364215746521950, -0.0377063788473606, -0.0449111759662628, -0.0476365163922310, 0.1522845029830933, -0.0131391752511263, -0.0476671792566776, -0.0378389135003090, 0.0235454943031073, 0.0224007442593575, -0.0010372076649219, -0.0089435689151287, -0.0293026417493820, 0.0274190884083509, 0.0469092652201653));
	result += mul(ml2, float4x4(0.0297575183212757, -0.0132508194074035, -0.0044682323932648, -0.0096222748979926, 0.2525918781757355, 0.1873829364776611, -0.5599535703659058, -0.2372044622898102, 0.0033207221422344, 0.0256173480302095, 0.0294605866074562, 0.0323960892856121, -0.1679904460906982, -0.1278967708349228, 0.3168168365955353, 0.1978507637977600));
	result += mul(bl1, float4x4(-0.0047590560279787, -0.0149335600435734, 0.0033453819341958, -0.0012247267877683, 0.1112466752529144, 0.0147760482504964, 0.0031189601868391, 0.0391573049128056, -0.0028154491446912, -0.0036881719715893, -0.0116015253588557, -0.0037573333829641, 0.0047581391409039, 0.0071071563288569, -0.0033221673220396, 0.0004882142529823));
	result += mul(bl2, float4x4(-0.0025197160430253, -0.0018677815096453, 0.0038254233077168, 0.0041981274262071, -0.1321131736040115, -0.0494364202022552, 0.0760654658079147, -0.1386690139770508, -0.0016222692793235, -0.0060105528682470, 0.0010201989207417, 0.0092753591015935, -0.0194614846259356, 0.0087382243946195, -0.0606758072972298, 0.0156162241473794));
	result += mul(tc1, float4x4(-0.0073722628876567, 0.0012844242155552, 0.0241398401558399, -0.0075527969747782, -0.0865194946527481, -0.0610522404313087, 0.0289319511502981, -0.0994452014565468, 0.0281447004526854, -0.0250582899898291, 0.0044891634024680, -0.0246205236762762, 0.0112307453528047, -0.0010844616917893, -0.0223584957420826, 0.0177635718137026));
	result += mul(tc2, float4x4(-0.0585863515734673, 0.0953190475702286, -0.0555586628615856, 0.1033507287502289, 0.1560877263545990, -0.0690897777676582, -0.0341389514505863, -0.0661668032407761, 0.0531073249876499, -0.0266165956854820, -0.0203275382518768, 0.0017760475166142, -0.1300747394561768, 0.1810652017593384, 0.0381597876548767, 0.1397419273853302));
	result += mul(mc1, float4x4(0.6259804368019104, 0.6062518954277039, 0.5450409054756165, 0.5966195464134216, -0.0423948727548122, 0.0760537460446358, -0.0113651463761926, 0.3007817566394806, -0.3218322694301605, 0.2713021934032440, -0.3143473267555237, 0.2303840517997742, 0.3493050038814545, 0.3590726852416992, 0.4138027429580688, 0.3391666412353516));
	result += mul(mc2, float4x4(0.0790478289127350, -0.0978994593024254, 0.0779844969511032, -0.0823706611990929, 0.0094470111653209, 0.1671760678291321, 0.1201528310775757, -0.2016288936138153, 0.3667598366737366, 0.3651430010795593, -0.3612343966960907, -0.2978236973285675, -0.4231655597686768, 0.0091423410922289, -0.1918412446975708, 0.4224558770656586));
	result += mul(bc1, float4x4(-0.0186564289033413, 0.0274957418441772, -0.0064405309967697, 0.0056951809674501, 0.4864942431449890, -0.2563461959362030, 0.4357284605503082, -0.2976118028163910, 0.0374982468783855, 0.0167757049202919, 0.0305800959467888, 0.0232830215245485, 0.0138373551890254, -0.0191283021122217, 0.0032355054281652, 0.0055057541467249));
	result += mul(bc2, float4x4(-0.0276355985552073, 0.0048149987123907, -0.0251619722694159, -0.0057246969081461, 0.0271473955363035, -0.0042668608948588, -0.0594691745936871, 0.2255926281213760, -0.0203660242259502, 0.0721646770834923, 0.0137230781838298, -0.0650938376784325, -0.3049557507038116, 0.2035628110170364, -0.2509683668613434, 0.1962853819131851));
	result += mul(tr1, float4x4(0.0109980758279562, -0.0053752651438117, -0.0112550277262926, 0.0024017230607569, 0.0362104885280132, 0.0084348218515515, -0.0106990104541183, -0.0207723993808031, -0.0014961160486564, 0.0066790678538382, 0.0028113177977502, 0.0025022011250257, -0.0093937022611499, 0.0016421369509771, 0.0035362334456295, -0.0058064293116331));
	result += mul(tr2, float4x4(0.0138889988884330, -0.0078343702480197, 0.0061464929021895, 0.0202130675315857, -0.0257590841501951, -0.0366640128195286, 0.0250097587704659, -0.0498071312904358, -0.0103149358183146, -0.0001786737266229, -0.0099909817799926, 0.0062733208760619, 0.0131437368690968, -0.0005469865864143, -0.0388854071497917, 0.0612070746719837));
	result += mul(mr1, float4x4(0.0052813654765487, 0.0215748809278011, 0.0107395220547915, -0.0079439217224717, 0.0382786765694618, 0.0697424262762070, -0.0415962152183056, 0.0657853558659554, 0.0209470037370920, -0.0218399092555046, -0.0447359494864941, 0.0407319553196430, -0.0040902681648731, -0.0196106657385826, -0.0018554026028141, 0.0203906055539846));
	result += mul(mr2, float4x4(-0.0106181986629963, 0.0084018819034100, 0.0131329754367471, -0.0198754761368036, 0.1117177084088326, 0.0990846082568169, -0.0732304081320763, 0.0163581725209951, -0.0648830309510231, -0.0451613292098045, 0.0206844564527273, 0.0031441387254745, -0.0106161693111062, -0.0567689687013626, 0.0782861113548279, -0.0306094046682119));
	result += mul(br1, float4x4(0.0012452082009986, -0.0026056850329041, -0.0096226977184415, -0.0037850935477763, -0.0190967041999102, 0.0534373670816422, 0.1599360853433609, 0.0834670960903168, -0.0070255175232887, 0.0012873009545729, 0.0030876772943884, -0.0093916896730661, -0.0033529615029693, 0.0043485122732818, 0.0089034689590335, -0.0067489291541278));
	result += mul(br2, float4x4(0.0004713654634543, -0.0034161377698183, -0.0026913962792605, 0.0053522582165897, -0.0040974905714393, 0.0273330621421337, -0.0333138220012188, -0.0701237097382545, 0.0082997502759099, -0.0183656588196754, -0.0122841577976942, -0.0052855615504086, -0.0023795007728040, -0.0438593104481697, -0.1101513057947159, -0.0182559806853533));

	// Chroma of the source texel; reused for all four output pixels.
	float2 originUV = mul(rgb2uv, INPUT.SampleLevel(sam, pos, 0).rgb);

	// i is the x offset and j the y offset inside the 2x2 group.
	[unroll]
	for (uint i = 0; i <= 1; ++i) {
		[unroll]
		for (uint j = 0; j <= 1; ++j) {
			const uint2 destPos = gxy + uint2(i, j);

			// (0, 0) was already validated by the early-out above;
			// only the other three pixels need their own check.
			if (i != 0 || j != 0) {
				if (!CheckViewport(destPos)) {
					continue;
				}
			}

			// Channel layout of result: x-major, i.e.
			// [0]=(0,0) [1]=(0,1) [2]=(1,0) [3]=(1,1) as (x, y) offsets.
			const uint index = i * 2 + j;
			WriteToOutput(destPos, mul(yuv2rgb, float3(result[index], originUV)));
		}
	}
}
